Grab Harry Potter Text

In [2]:
# http://www.glozman.com/textpages.html
# Harry Potter 1 - Sorcerer's Stone.txt
# Harry Potter 2 - Chamber of Secrets.txt
# Harry Potter 3 - The Prisoner Of Azkaban.txt
# Harry Potter 4 - The Goblet Of Fire.txt
# Harry Potter 5 - Order of the Phoenix.txt
# Harry Potter 6 - The Half Blood Prince.txt
# Harry Potter 7 - Deathly Hollows.txt

In [3]:
# Read the whole book into memory, then lower-case it so that upper- and
# lower-case forms of a letter collapse into a single vocabulary symbol.
with open("texts/HarryPotter1-SorcerersStone.txt", "r") as book_file:
    text = book_file.read()
text = text.lower()

In [4]:
# Vocabulary: every distinct character of the corpus, sorted so that the
# character -> index assignment is the same on every run.
chars = sorted(set(text))

# Lookup tables for both directions (character <-> integer id).
char_indices = {ch: idx for idx, ch in enumerate(chars)}
indices_char = {idx: ch for idx, ch in enumerate(chars)}

# Bare last expression: shown as the cell's output.
'corpus length: {}  total chars: {}'.format(len(text), len(chars))

'corpus length: 442745  total chars: 54'

In [5]:

harry potter and the sorcerer's stone 

chapter one 

the boy who lived 

mr. and mrs. dursley, of n

Create the Training set

Build the training dataset. Slide a 40-character window over the text and, for each window, save the character that immediately follows it (the 41st). We will teach the model that a given 40-character sequence should generate that 41st character. The window advances with a step size of 3, so consecutive windows overlap and we get many more 40/41 samples.

In [6]:
maxlen = 40   # number of characters in each input window
step = 3      # distance between the starts of consecutive windows

# Start offsets of every window that still has a following character in `text`.
window_starts = range(0, len(text) - maxlen, step)

# sentences[k] is the k-th window; next_chars[k] is the character right after it.
sentences = [text[start: start + maxlen] for start in window_starts]
next_chars = [text[start + maxlen] for start in window_starts]
print("sequences: ", len(sentences))

sequences:  147569

In [7]:

harry potter and the sorcerer's stone 

ry potter and the sorcerer's stone 


In [8]:


One-hot encode

In [9]:
import numpy as np

# One-hot tensors built from the windows:
#   X[i, t, c] is True when character t of window i is vocabulary entry c
#   y[i, c]    is True when the character following window i is vocabulary entry c
# The builtin `bool` is used as dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

Create the Model

In [10]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop

# Three stacked LSTM layers read the one-hot window. The first two return the
# whole sequence so the following LSTM receives one vector per time step; the
# last one returns only its final output. `input_shape` is only needed on the
# first layer of a Sequential model.
model = Sequential()
model.add(LSTM(256, recurrent_dropout=0.0, input_shape=(maxlen, len(chars)), return_sequences=True))
model.add(LSTM(256, recurrent_dropout=0.0, return_sequences=True))
model.add(LSTM(256, recurrent_dropout=0.0))

# Classification head. Without it the model outputs 256 values per sample,
# which matches neither the one-hot targets (len(chars) wide) nor sample(),
# which expects one probability per character.
# The layer sizes follow the recorded model summary (Dense 108 -> Dense 54 ->
# Activation).
# NOTE(review): the summary does not show whether the 108-unit layer had an
# activation, nor which activation the final layer used; softmax is assumed
# because the loss is categorical cross-entropy - confirm.
model.add(Dense(108))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))

optimizer = RMSprop()
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

# Print the layer table (the recorded cell output is a model summary).
model.summary()


Model: "sequential_2"
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 40, 256)           318464    
lstm_2 (LSTM)                (None, 40, 256)           525312    
lstm_3 (LSTM)                (None, 256)               525312    
dense_1 (Dense)              (None, 108)               27756     
dense_2 (Dense)              (None, 54)                5886      
activation_1 (Activation)    (None, 54)                0         
Total params: 1,402,730
Trainable params: 1,402,730
Non-trainable params: 0

Train the Model

In [54]:
# Training hyper-parameters.
epochs, batch_size = 100, 512

# Kept as the last expression of the cell so the returned History object is
# what the cell displays.
model.fit(x=X, y=y, epochs=epochs, batch_size=batch_size)

<keras.callbacks.callbacks.History at 0x1f08a3d7a90>

In [55]:
# model.save_weights("potter_lstm_weights_0568.h5")

Generate new sequence

In [11]:

In [12]:
import random

def sample(preds, temperature=1.0):
    """Draw one index from a probability vector after temperature scaling.

    The probabilities are re-weighted as ``softmax(log(preds) / temperature)``
    and a single index is drawn from the resulting distribution with
    ``np.random.multinomial``.

    Args:
        preds: array-like of probabilities (one entry per class).
        temperature: scaling factor. Values below 1 sharpen the distribution,
            values above 1 flatten it. A value <= 0 selects the most probable
            index directly (greedy), instead of dividing by zero.

    Returns:
        The sampled index, as returned by ``np.argmax``.
    """
    # float64 keeps the normalised probabilities accurate enough for
    # np.random.multinomial's sum check.
    preds = np.asarray(preds).astype('float64')

    # Greedy limit: avoids the division by zero of the scaling step.
    if temperature <= 0:
        return np.argmax(preds)

    # log(0) = -inf is intended for impossible classes (exp(-inf) == 0), so
    # the divide-by-zero warning is silenced rather than surfaced.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature

    # Shift by the maximum before exponentiating: mathematically a no-op for
    # softmax, but it keeps the largest term at exp(0) == 1 so that very small
    # temperatures cannot underflow every term to 0 (which would yield 0/0).
    logits -= np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)

    # One multinomial trial gives a one-hot row; its argmax is the drawn index.
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [13]:
import sys
# A single random seed window is drawn once and reused for every diversity
# value, so the generated texts all start from the same prompt.
start_index = random.randint(0, len(text) - maxlen - 1)
for diversity in [0.2, 0.5, 1.0]:
    print('----- diversity:', diversity)
    generated = ''
    sentence = text[start_index: start_index + maxlen]
    generated += sentence
    print('----- Generating with seed: "' + sentence + '"')
    # Echo the seed, then stream each generated character after it; without
    # these writes the text is accumulated in `generated` but never shown.
    sys.stdout.write(generated)
    for i in range(400):
        # One-hot encode the current window as a batch of one.
        x = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(sentence):
            x[0, t, char_indices[char]] = 1.
        preds = model.predict(x, verbose=0)[0]
        next_index = sample(preds, diversity)
        next_char = indices_char[next_index]
        generated += next_char
        # Slide the window: drop the oldest character, append the new one.
        sentence = sentence[1:] + next_char
        sys.stdout.write(next_char)
        sys.stdout.flush()
    # Terminate the streamed text before the next diversity header.
    print()

----- diversity: 0.2
----- Generating with seed: "s stupid, fat rat yellow." 

he waved hi"
s stupid, fat rat yellow." 

he waved his wifd, forg
otte robblows and petunia field what he mied on its mind. the carent see than all day. it was hermione near thas he wooded mountain troll. and mone. o'd all the hat if i cold him to his face." 

"look done this me somethin' all sitten yer muties." 

he said, "ei'll can to expections chisers. ust past whise-here. it was only anyone found off his mother, they're not all in be a sholl tim

----- diversity: 0.5
----- Generating with seed: "s stupid, fat rat yellow." 

he waved hi"
s stupid, fat rat yellow." 

he waved his wifd, because they weren't starting and there of from the mirror of erised quietly, bown here his lapped in a wazm. "he are yound mation in the school, "i'm said! ron and hermione. 

harry put his lang stopper meseley twiffing out with his really. he took a low ron dripped it fride mirs charlew hermione waited hather had even senkoned out of earing untious footstoppisdowed pleated on the ribbsar

----- diversity: 1.0
----- Generating with seed: "s stupid, fat rat yellow." 

he waved hi"
s stupid, fat rat yellow." 

he waved his wird behind the first years feele as he could. 

"you two we've got to come, it," he said forward. "than is -- all yelled your father arrivy yeh on, i was doing out what he's spind, too. it was a sistack start chiven's copparte cloak, he knew that gient stuck in his eyes on the door. 

"well vernon, his if it was doing," he said suddenly and schoollegs and school riding stepped on the dangest as

